{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The raw data for the Adult dataset is located at the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/adult).\n",
    "\n",
    "# Download raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100 3881k  100 3881k    0     0   152k      0  0:00:25  0:00:25 --:--:--  170k:--     0\n",
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100 1956k  100 1956k    0     0  54508      0  0:00:36  0:00:36 --:--:--  127k\n"
     ]
    }
   ],
   "source": [
    "# --fail: exit with an error on an HTTP failure instead of saving the server's error page as CSV.\n",
    "# --location: follow redirects, should the files be moved.\n",
    "!curl --fail --location https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data > train_data_raw.csv\n",
    "# `tail -n +2` drops the first line of adult.test before saving\n",
    "# (presumably a non-data line -- verify against the downloaded file).\n",
    "!curl --fail --location https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test | tail -n +2 > test_data_raw.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.insert(0, '../..')\n",
    "\n",
    "import pandas as pd\n",
    "from data import CVCTRImputer\n",
    "\n",
    "\n",
    "def adult_data_to_documents_and_labels(path):\n",
    "    data_raw = pd.read_csv(path, sep=',\\s', na_values=['?'], header=None)\n",
    "    return data_raw.iloc[:, :-1], data_raw.iloc[:, -1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/bshar/anaconda/envs/influence_boosting2/lib/python2.7/site-packages/ipykernel_launcher.py:9: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n",
      "  if __name__ == '__main__':\n"
     ]
    }
   ],
   "source": [
    "# Parse the downloaded train split into (features, labels).\n",
    "train_documents_raw, train_labels_raw = adult_data_to_documents_and_labels(\n",
    "    path='train_data_raw.csv'\n",
    ")\n",
    "# Same for the test split.\n",
    "test_documents_raw, test_labels_raw = adult_data_to_documents_and_labels(\n",
    "    path='test_data_raw.csv'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocess data\n",
    "- Binarize targets\n",
    "- Replace categorical features with CTRs, computed via cross-validation for the train set and on the whole train set for the test set\n",
    "- Replace NaNs/missing values with column means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1 where the raw label equals '>50K', 0 otherwise. The test split is matched\n",
    "# against '>50K.' (with a trailing period) instead.\n",
    "train_labels = train_labels_raw.eq('>50K').astype(int)\n",
    "test_labels = test_labels_raw.eq('>50K.').astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns at these positions are treated as categorical and replaced with CTRs.\n",
    "imputer = CVCTRImputer(\n",
    "    n_folds=10,\n",
    "    cat_features_idxs=[1, 3, 5, 6, 7, 8, 9, 13],\n",
    ")\n",
    "# The imputer is fitted on the train split only; the call order below is kept as-is\n",
    "# because CVCTRImputer is defined outside this notebook (it may be stateful).\n",
    "imputer.fit(train_documents_raw, train_labels)\n",
    "train_documents = imputer.transform_train(train_documents_raw)\n",
    "test_documents = imputer.transform_test(test_documents_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-column means of the train set, keyed by column label. DataFrame.mean() looks\n",
    "# columns up by label, so it stays correct even when labels are not 0..n-1\n",
    "# (positional .iloc lookups by label only work while labels equal positions).\n",
    "train_set_means = train_documents.mean()\n",
    "# Fill missing values in both splits using train-set statistics only.\n",
    "train_documents = train_documents.fillna(train_set_means)\n",
    "test_documents = test_documents.fillna(train_set_means)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_in_catboost_format(documents, labels, output_path, cd_path=None):\n",
    "    labels_and_documents = pd.concat([labels, documents], axis=1)\n",
    "    labels_and_documents.to_csv(output_path, sep='\\t', header=None, index=None)\n",
    "    if cd_path is not None:\n",
    "        with open(cd_path, 'w') as f:\n",
    "            f.write('0\\tTarget\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train set with labels, plus the 'cd' file that marks column 0 as the target.\n",
    "save_in_catboost_format(\n",
    "    documents=train_documents,\n",
    "    labels=train_labels,\n",
    "    output_path='train_data_catboost_format.tsv',\n",
    "    cd_path='cd',\n",
    ")\n",
    "# Features-only copy of the train set (no label column, no header, no index).\n",
    "train_documents.to_csv(\n",
    "    'train_documents.tsv',\n",
    "    sep='\\t',\n",
    "    header=None,\n",
    "    index=None,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test set with labels; no 'cd' file is written for this call.\n",
    "save_in_catboost_format(\n",
    "    documents=test_documents,\n",
    "    labels=test_labels,\n",
    "    output_path='test_data_catboost_format.tsv',\n",
    ")\n",
    "# Features-only copy of the test set (no label column, no header, no index).\n",
    "test_documents.to_csv(\n",
    "    'test_documents.tsv',\n",
    "    sep='\\t',\n",
    "    header=None,\n",
    "    index=None,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
